May 27th, 2020

Performance improvement

There are some tricks you can follow in order to get faster code; we will use our Pi function from the previous topic:

# Monte Carlo estimate of pi: draw `l` points uniformly in [-1,1]^2 and
# track the running fraction that falls inside the unit circle.
#
# Args:
#   l: number of iterations (points to draw).
# Returns:
#   Numeric vector of length `l` with the partial pi estimate after
#   each iteration.
sim <- function(l) {
  estimates <- numeric(l)  # pre-allocate; avoids shadowing base::c
  hits <- 0
  # Euclidean norm of a 2-vector (distance of the point from the origin).
  pow2 <- function(x) sqrt(x[1] * x[1] + x[2] * x[2])
  for (i in seq_len(l)) {  # seq_len() is safe for l = 0 (1:l is not)
    x <- runif(2, -1, 1)
    if (pow2(x) <= 1) {
      hits <- hits + 1
    }
    # hit fraction approximates (area of circle)/(area of square) = pi/4
    estimates[i] <- (hits / i) * 4
  }
  estimates
}

Performance improvement

The execution time of this function for 100,000 iterations is:

# Time the loop-based simulation for 100,000 iterations.
size <- 100000
system.time(
   res <- sim(size)
)
##    user  system elapsed 
##   0.248   0.032   0.282

Vectorization

If we vectorize the code we obtain a better performance:

# Vectorized Monte Carlo estimate of pi: draw `l` points uniformly in
# the unit square [0,1]^2 and use the fraction that falls inside the
# quarter circle of radius 1.
#
# Args:
#   l:    number of points to draw.
#   seed: RNG seed. Defaults to the value the original version
#         hard-coded, so existing calls reproduce the same result.
#         Pass NULL to leave the global RNG state untouched.
# Returns:
#   A single numeric pi estimate.
simv <- function(l, seed = 0.234234) {
  if (!is.null(seed)) {
    set.seed(seed)
  }
  x <- runif(l)
  y <- runif(l)
  z <- sqrt(x^2 + y^2)
  # mean(z <= 1) is the hit fraction; times 4 gives the pi estimate
  # (identical to length(which(z <= 1)) * 4 / length(z)).
  4 * mean(z <= 1)
}

Vectorization

# Time the vectorized simulation for the same number of points.
size <- 100000
system.time(
  res <- simv(size)
)
##    user  system elapsed 
##   0.006   0.000   0.007

The message from this example is that loops are expensive in R, and vectorization can help to improve the performance of the code.

Common vector operations: \(+\), \(-\), \(/\), \(*\), and the matrix product \(\%*\%\).

Memory pre-allocation

# Anti-example: grow a vector one element at a time with c().
# Each c() call reallocates and copies the whole vector, so the loop
# does O(N^2) work overall.
N <- 1E5
data1 <- 1 
system.time({
    for (j in 2:N) {
      # random-walk step: previous value plus a draw from -5..5
      data1 <- c(data1, data1[j-1] + sample(-5:5, size=1))
    }
  })
##    user  system elapsed 
##  14.093   0.106  14.200

Memory pre-allocation

# Same random walk, but the result vector is allocated up front, so
# each iteration is a cheap in-place element assignment.
data2 <- numeric(N)
data2[1] <- 1
system.time({
  for (j in 2:N) {
    data2[j] <- data2[j-1] + sample(-5:5, size=1)
  }
})
##    user  system elapsed 
##   0.427   0.000   0.427

This example shows that pre-allocating memory reduces the execution time.

Using Data Frames

# Row sums over a data frame: rowSums() must first coerce the data
# frame to a matrix, which costs extra time and memory.
data1 <- rnorm(1E4*1000)
dataf <- data.frame(data1)
system.time(data2 <- rowSums(dataf))
##    user  system elapsed 
##   0.203   0.136   0.339

Using Data Frames

# Same computation on a matrix: assigning dim() reshapes the vector
# without copying the data, and rowSums() operates on it directly.
data1 <- rnorm(1E4*1000)
dim(data1) <- c(1E4,1000)
system.time(data1 <- rowSums(data1))
##    user  system elapsed 
##   0.019   0.000   0.019

Thus, it is more efficient to use matrices rather than data frames when doing numerical calculations.

Different implementations of functions

Principal components analysis

J. Chem. Inf. Mod., 57, 826-834 (2017)

Different implementations of functions

# Compare two PCA implementations on the same 1e5 x 100 matrix:
# prcomp() works via SVD of the data matrix, while princomp() uses an
# eigen-decomposition of the covariance matrix — faster here, but the
# two can differ numerically (see ?prcomp / ?princomp).
data <- rnorm(1E5*100)
dim(data) <- c(1E5,100)
system.time(prcomp_data <- prcomp(data))
##    user  system elapsed 
##   1.879   0.220   2.103
system.time(princomp_data <- princomp(data))
##    user  system elapsed 
##   0.710   0.280   0.991

Compiling your functions

library(microbenchmark)
library(compiler)

# Monte Carlo estimate of pi (same function as in the earlier section,
# repeated here as the baseline for byte-compilation benchmarks).
#
# Args:
#   l: number of iterations (points to draw).
# Returns:
#   Numeric vector of length `l` with the partial pi estimate after
#   each iteration.
sim <- function(l) {
  estimates <- numeric(l)  # pre-allocate; avoids shadowing base::c
  hits <- 0
  # Euclidean norm of a 2-vector (distance of the point from the origin).
  pow2 <- function(x) sqrt(x[1] * x[1] + x[2] * x[2])
  for (i in seq_len(l)) {  # seq_len() is safe for l = 0 (1:l is not)
    x <- runif(2, -1, 1)
    if (pow2(x) <= 1) {
      hits <- hits + 1
    }
    # hit fraction approximates (area of circle)/(area of square) = pi/4
    estimates[i] <- (hits / i) * 4
  }
  estimates
}

Compiling your functions

# Byte-compile sim() at each optimization level (0 = none, 3 = max)
# and benchmark all variants against the interpreted original.
sim.comp0 <- cmpfun(sim, options=list(optimize=0))
sim.comp1 <- cmpfun(sim, options=list(optimize=1))
sim.comp2 <- cmpfun(sim, options=list(optimize=2))
sim.comp3 <- cmpfun(sim, options=list(optimize=3))

size <- 100000
# microbenchmark runs each expression 100 times by default.
bench <- microbenchmark(sim(size), sim.comp0(size), sim.comp1(size), sim.comp2(size), 
                        sim.comp3(size))

Compiling your functions

# Print the benchmark summary table (times in milliseconds).
bench
## Unit: milliseconds
##             expr      min       lq     mean   median       uq      max neval
##        sim(size) 215.3545 224.7008 230.6092 229.4785 232.4571 273.6263   100
##  sim.comp0(size) 403.2480 419.1577 429.8163 429.1565 437.4987 485.0128   100
##  sim.comp1(size) 269.5253 284.3602 291.7589 290.8161 297.0802 329.3568   100
##  sim.comp2(size) 215.5185 224.8974 231.2170 230.0413 233.1783 289.4366   100
##  sim.comp3(size) 211.2544 219.2418 226.0091 224.7076 229.1453 269.4477   100
##   cld
##  ab  
##     d
##    c 
##   b  
##  a

Compiling your functions

Visualize the results:

library(ggplot2)
# microbenchmark results have an autoplot() method that draws a
# violin plot of the timing distributions.
autoplot(bench)

Violin Plot

Just in time compilation

library(compiler)
# Level 3 enables JIT compilation of all closures and loops, so sim()
# is byte-compiled automatically the first time it is called.
enableJIT(level=3)
## [1] 3
bench <- microbenchmark(sim(size))
bench
## Unit: milliseconds
##       expr     min       lq     mean   median       uq      max neval
##  sim(size) 225.043 229.9735 236.9216 232.4703 236.3822 400.0836   100

Advanced Topics (Kebnekaise):

Calling external functions

! Monte Carlo estimate of pi, intended to be called from R via
! .Fortran("pifunc", n = ...). Draws n points uniformly in the unit
! square, counts those inside the quarter circle, and prints the
! estimate together with n to standard output.
! NOTE(review): srand/rand are GNU Fortran extensions, not standard
! Fortran — this requires gfortran (as used below).
subroutine pifunc(n)
implicit none
! fixed seed so repeated calls produce the same stream
integer, parameter :: seed = 86456
integer     :: i,n,hits
real        :: x,y,r,pival
call srand(seed)
hits = 0
do i=1,n
   x = rand()
   y = rand()
   r = sqrt(x*x + y*y)
   if(r <= 1) then 
       hits = hits + 1
   endif
enddo
! 4 * (hit fraction): quarter-circle area ratio times 4 gives pi
pival = 4.0d0*hits/(1.0*n)
write(6,*) pival,n
end subroutine pifunc

Advanced Topics (Kebnekaise):

Calling external functions

One compiles the function using standard compilers (Linux, Kebnekaise):

# Build the shared library as pi.so so that the dyn.load("pi.so") call
# below finds it (the original wrote the output to "picalc", which
# does not match the file name loaded from R).
gfortran -shared -fPIC -o pi.so pi.f90
size <- 100000

# Load the compiled shared library and check the symbol is available.
dyn.load("pi.so")
is.loaded("pifunc")
## [1] TRUE
# .Fortran passes arguments by reference and returns them as a list;
# the pi estimate itself is printed by the Fortran code, not returned.
.Fortran("pifunc", n = as.integer(size))
## $n
## [1] 100000

Advanced Topics (Kebnekaise):

Calling external functions

Now we can benchmark our functions:

library(microbenchmark)
# Compare the pure-R sim() with the compiled Fortran routine; the
# output below shows the Fortran version is roughly 50x faster here.
bench <- microbenchmark(sim(size), .Fortran("pifunc", n = as.integer(size)))
bench
## Unit: milliseconds
##                                      expr        min        lq       mean
##                                 sim(size) 223.418847 229.73806 238.168545
##  .Fortran("pifunc", n = as.integer(size))   3.786777   4.04951   4.094266
##      median         uq        max neval cld
##  232.300850 240.229816 399.633737   100   b
##    4.147272   4.175144   4.464601   100  a

Advanced Topics (Kebnekaise):

BLAS/LAPACK libraries

On Kebnekaise the OpenBLAS libraries are available:

# Show the R version and which BLAS/LAPACK libraries R is linked to.
sessionInfo()

R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.6 LTS

Matrix products: default
BLAS/LAPACK: /cvmfs/.../8.2.0-2.31.1/OpenBLAS/0.3.5/lib/libopenblas_haswellp-r0.3.5.so

Advanced Topics (Kebnekaise):

BLAS/LAPACK libraries

The number of threads used by the BLAS library can be controlled with the RhpcBLASctl package:

# Benchmark the largest-eigenvalue computation of a symmetric
# 5000 x 5000 matrix, averaged over nsim independent runs, with the
# BLAS thread count pinned to one.
library(RhpcBLASctl)
n <- 5000                     # matrix size n x n
nsim <- 3                     # number of independent simulations
set.seed(123)                 # reproducible random matrices
summa <- 0                    # accumulated elapsed time
x <- 0
blas_set_num_threads(1)       # restrict BLAS to a single thread
for (i in 1:nsim) {
  m <- matrix(rnorm(n^2), n)  # random n x n matrix
  a <- crossprod(m)           # t(m) %*% m, a symmetric matrix
  timing <- system.time({
    x <- eigen(a, symmetric=TRUE, only.values=TRUE)$values[1]
  })[3]                       # keep elapsed time only
  summa <- summa + timing
}
times <- summa/nsim           # mean elapsed time over the runs
cat(c("Computation of eig. random matrix 5000x5000 (sec): ", times, "\n"))
## Computation of eig. random matrix 5000x5000 (sec):  15.613

Advanced Topics (Kebnekaise):

BLAS/LAPACK libraries

### Using 2 threads
# Same benchmark as the single-threaded run above, for comparison of
# the mean elapsed time.
blas_set_num_threads(2)         # allow BLAS to use two threads
set.seed(123); summa <- 0; x <- 0              # same seed => same matrices as before
for (i in 1:nsim) {
  m <- matrix(rnorm(n^2), n)    # random n x n matrix
  a <- crossprod(m)             # t(m) %*% m, a symmetric matrix
  timing <- system.time({
    x <- eigen(a, symmetric=TRUE, only.values=TRUE)$values[1]   # largest eigenvalue only
  })[3]
  summa <- summa + timing
}
times <- summa/nsim
cat(c("Computation of eig. random matrix 5000x5000 (sec): ", times, "\n"))
## Computation of eig. random matrix 5000x5000 (sec):  8.06333333333333

References